The UN speech that US president Donald Trump gave to the United Nations General Assembly on Sept. 19, 2017 was fascinating for several reasons. For me personally, it was interesting because the speech included complex sentences, and the content was much more complex than all previous talks that I’d heard from him. Several countries were mentioned in the talk, in different emotional contexts.

So I started to wonder which countries were mentioned in the speech, and in what emotional context.

However,

http://i0.kym-cdn.com/photos/images/newsfeed/000/519/726/2bf.gif

library(magrittr)
library(tidyverse)
library(rvest)
library(tidytext)
library(forcats)
library(wordcloud)
library(wordcloud2)
library(rworldmap)
library(stringr)
library(ggrepel)
library(hunspell)
library(igraph)
library(ggraph)
# Source of the transcript (Politico, full text of the speech)
url <- "http://www.politico.com/story/2017/09/19/trump-un-speech-2017-full-text-transcript-242879"

# Scrape the transcript paragraphs and rewrite multi-word country names so
# that each of them survives word tokenisation as a single token.
speech_excerpt <- 
        read_html(url) %>% # Download and parse the whole page
        html_nodes("style~ p+ p , .lazy-load-slot+ p , .fixed-story-third-paragraph+ p , .story-related+ p , p~ p+ p") %>% # Select the required elements (by css selector)
        html_text() %>% # Make it text
        .[-2] %>% # Drop the second element (page text, not part of the speech)
        gsub("Mr.", "Mr", ., fixed = TRUE) %>% # Make sure that dots in the text will not signify sentences
        # Order matters: the longest variants are replaced first, so every way
        # of naming the US collapses into the single token "usa".
        gsub("United States of America", "usa", .) %>% 
        gsub("United States", "usa", .) %>% 
        gsub("America", "usa", .) %>% # Also turns "American(s)" into "usan(s)"
        # NOTE(review): this also rewrites any other capitalised "States"
        # (e.g. "Member States") - confirm that this is intended.
        gsub("States", "usa", .) %>% 
        gsub("North Korea", "north_korea", .) %>% # The underscore keeps North Korea one token for now
        tibble(paragraph = .) # One row per paragraph
                        
# Break every paragraph up into sentences (one row per sentence)
speech_sentences <- unnest_tokens(speech_excerpt, sentence, paragraph, token = "sentences")
        
# One row per word; word forms derived from a country are reduced to the country name
speech_words <- 
        unnest_tokens(speech_excerpt, word, paragraph, token = "words") %>% 
        mutate(
                word = gsub("_", " ", word), # "north_korea" becomes "north korea" again
                word = str_replace_all(word, "'s$", ""), # Cut 's
                # Manual stemming of country names. Standard stemmers (snowballC,
                # hunspell, textstem) and a custom dictionary did not give
                # satisfactory results, so the known forms are mapped by hand.
                word = case_when(
                        word == "iranian" ~ "iran",
                        word %in% c("usans", "north koreans") ~ str_replace(word, "ns$", ""),
                        word %in% c("usan", "syrian", "african", "cuban", "venezuelan") ~ str_replace(word, "n$", ""),
                        TRUE ~ word
                )
        )
# Word cloud of the word frequencies, with common stop words removed
wordcloud2(
        count(
                anti_join(speech_words, stop_words, by = "word"),
                word,
                sort = TRUE
        )
)
# Variant that shapes the cloud after an image file:
# wordcloud2(figPath = "trump.png") # Image downloaded from http://westviewnews.org/wp-content/uploads/2017/01/donald-trump-profile-silhouette-vector-graphic_template_1469623874993O0L.png
# Sentiment-bearing words (bing lexicon) that were uttered more than 3 times.
# The counts of negative words are negated, so they are drawn on the opposite
# side of zero from the positive words.
speech_words %>% 
        count(word, sort = TRUE) %>% 
        inner_join(get_sentiments("bing"), by = "word") %>% 
        filter(n > 3) %>% 
        mutate(n = if_else(sentiment == "negative", -n, n)) %>% 
        ggplot() +
                aes(y = n, x = fct_reorder(word, n), fill = sentiment) +
                geom_col() +
                coord_flip() + # Words on the vertical axis, so they stay readable
                labs(x = "word", 
                     y = "Occurrence in speech",
                     title = "Most common words in Trump's 17/09/19 UN speech by sentiment")

Comparison cloud of all words

# Comparison cloud of the bing sentiment words.
# comparison.cloud() is fed a table with the words as row names and one
# count column per sentiment (0 where a word lacks that sentiment).
inner_join(speech_words, get_sentiments("bing"), by = "word") %>% 
        count(word, sentiment, sort = TRUE) %>%
        spread(key = sentiment, value = n, fill = 0L) %>%
        as.data.frame() %>% 
        remove_rownames() %>% 
        column_to_rownames(var = "word") %>% 
        comparison.cloud(term.matrix = ., colors = c("red", "blue"))

Let’s look into specific emotions using the nrc sentiment dictionary

It is also possible to look at the words associated with distinct emotions in the whole speech.

# Number of words linked to each distinct emotion of the nrc lexicon
speech_words %>%
        inner_join(get_sentiments("nrc"), by = "word") %>% # Use distinct emotion dictionary
        filter(!sentiment %in% c("positive", "negative")) %>% # Only look for distinct emotions
        count(sentiment, sort = TRUE) %>% # count() groups by sentiment itself
        ggplot() +
                aes(x = fct_reorder(str_to_title(sentiment), -n), # Most frequent emotion first
                    y = n, 
                    label = n) +
                geom_col() +
                geom_label(vjust = 1) +
                theme_minimal() +
                labs(title = "The occurrence of words linked to distinct emotions in the speech", 
                     x = "Emotion", 
                     y = "Frequency")

Let’s put the sentiments on a map

First, let’s see which countries were mentioned and how many times. Obviously, the (United States of) America is first! Iran, Venezuela, North Korea were also mentioned more than 5 times.

# World map polygons; region names are lower-cased so they can be matched to words
map_world <- mutate(map_data(map = "world"), region = str_to_lower(region))

# Mentions per word joined to the polygons. The right join keeps every
# polygon row, so countries that were never mentioned get n = NA.
trump_countries <-
        count(speech_words, word) %>% 
        right_join(map_world, by = c("word" = "region")) %>% 
        rename(region = word)

# Mentioned countries, each with the mean of its polygon coordinates
country_names <- 
        trump_countries %>% 
        filter(!is.na(n)) %>%
        group_by(region) %>% 
        summarise(lat = mean(lat), long = mean(long))

Map countries with the number of mentions (USA excluded)

What does this look like on a map? We concentrate on non-US countries, as the USA would outweigh all differences between other countries.

# Number of mentions per country on the world map. The count of the USA is
# blanked out (as the accompanying text states), so the USA is drawn in the
# neutral colour and gets no label; its polygon still comes from trump_countries.
mentions_no_usa <- 
        trump_countries %>% 
        mutate(n = replace(n, region == "usa", NA))

mentions_no_usa %>% 
        ggplot() +
        aes(map_id = region, 
            x = long, 
            y = lat, 
            label = paste0(str_to_title(region), ": ", n)) +
        geom_map(aes(fill = log10(n)), # Log scale dampens the differences between countries
                 map = trump_countries) +
        # One label per mentioned country, placed at its first polygon point
        geom_label_repel(data = mentions_no_usa %>% 
                                 drop_na(n) %>% 
                                 group_by(region) %>% 
                                 slice(1), 
                         alpha = .75) +
        scale_fill_gradient(low = "lightblue", 
                            high = "darkblue", 
                            na.value = "grey90") +
        labs(title = "Number of mentions by country", 
             x = "Longitude", 
             y = "Latitude") +
        theme_minimal() +
        theme(legend.position = "none")

# Sentiment of each sentence: the sum of the bing scores of its words
# (+1 positive, -1 negative); words missing from the lexicon do not count.
sentence_sentiment <-
        speech_sentences %>% 
        mutate(sentence_num = row_number()) %>% # Running number of the sentence
        unnest_tokens(word, sentence, "words") %>% 
        mutate(word = gsub("_", " ", word)) %>% 
        # Manual stemming of country names, the same mapping as used for
        # speech_words (standard stemmers gave no satisfactory results).
        mutate(word = word %>% 
                       str_replace_all("'s$","") %>% # Cut 's
                       if_else(. == "iranian", "iran", .) %>% 
                       if_else(. %in% c("usans", "north koreans"), str_replace(., "ns$",""),.) %>% 
                       if_else(. %in% c("usan","syrian","african","cuban","venezuelan"), str_replace(., "n$",""),.)
        ) %>% 
        left_join(get_sentiments("bing"), by = "word") %>%
        mutate(sentiment_score = case_when(sentiment == "positive" ~ 1,
                                           sentiment == "negative" ~ -1,
                                           is.na(sentiment) ~ NA_real_)) %>%
        group_by(sentence_num) %>%
        summarise(sum_sentiment = sum(sentiment_score, na.rm = TRUE),
                  sentence = paste(word, collapse = " ")) # Sentence rebuilt from its normalised words
# Which sentence has a country name.
# The words get the same normalisation as in speech_words, from which the
# list of mentioned countries is derived. Without it, forms such as "syrian"
# or a possessive would not be matched to their country.
country_sentence <- 
        speech_sentences %>% 
        mutate(sentence_num = row_number()) %>% 
        unnest_tokens(word, sentence, "words") %>% 
        mutate(word = gsub("_", " ", word)) %>% 
        mutate(word = word %>% 
                       str_replace_all("'s$","") %>% # Cut 's
                       if_else(. == "iranian", "iran", .) %>% 
                       if_else(. %in% c("usans", "north koreans"), str_replace(., "ns$",""),.) %>% 
                       if_else(. %in% c("usan","syrian","african","cuban","venezuelan"), str_replace(., "n$",""),.)
        ) %>% 
        # Keep only the words that are mentioned countries
        right_join(country_names %>% select(region), by = c("word" = "region")) %>% 
        arrange(sentence_num)
# Sentiment per country: the summed sentiment of the sentences that mention it
country_sentiment <-         
        full_join(sentence_sentiment, country_sentence, by = "sentence_num") %>% 
        transmute(region = word, sum_sentiment) %>% 
        filter(!is.na(region), !is.na(sum_sentiment)) %>% # Only sentences that have a country
        group_by(region) %>% 
        summarise(country_sentiment = sum(sum_sentiment, na.rm = TRUE))

So, how about looking into the text and checking in which sentences the countries are mentioned?

# Sentence sentiment in the order of the speech, with the country names
# shown at the sentences in which they occur
sentence_sentiment %>% 
        full_join(country_sentence, by = "sentence_num") %>% 
        mutate(sentiment_type = case_when(sum_sentiment > 0 ~ "positive",
                                          sum_sentiment < 0 ~ "negative",
                                          sum_sentiment == 0 ~ "neutral") %>% 
                                fct_rev() # Reverse the alphabetical level order (positive first)
               ) %>% 
        ggplot() +
                aes(x = sentence_num, 
                    y = sum_sentiment, 
                    label = word %>% str_to_title()) +
                geom_hline(yintercept = 0, 
                           color = "grey", 
                           linetype = "dashed", 
                           size = 1.2) +
                geom_smooth(span = 0.05, 
                            se = TRUE, 
                            size = 1.2, 
                            color = "black") +
                geom_label_repel(aes(fill = sentiment_type), 
                                 alpha = .8, 
                                 segment.alpha = 0) +
                # Named values keep each colour tied to its sentiment even when
                # one of the three levels does not occur in the data
                scale_fill_manual(values = c(positive = "green", 
                                             neutral = "grey", 
                                             negative = "red")) +
                theme_minimal() +
                labs(x = "Sentence number", 
                     y = "Sentence sentiment", 
                     title = "The summarised sentiment of sentences, and the appearance of country names in the speech \nby sentiment in sentence order",
                     subtitle = "The dashed line signifies neutral sentence sentiment. \nCountry label colors show the direction of the sentiment (positive/negative)") 

# Attach the per-country sentiment to the map polygons
sentiment_map_data <- left_join(trump_countries, country_sentiment, by = "region")

# World map coloured by country sentiment; the fill of the USA is set to NA
ggplot(data = mutate(sentiment_map_data,
                     country_sentiment = if_else(region == "usa", NA_real_, country_sentiment))) +
        aes(map_id = region, 
            x = long, 
            y = lat, 
            label = paste0(str_to_title(region), ": ", country_sentiment)) +
        geom_map(aes(fill = country_sentiment), map = trump_countries) +
        # One label per mentioned country, taken from its first polygon point
        geom_label_repel(data = sentiment_map_data %>%
                                 filter(!is.na(n)) %>%
                                 group_by(region) %>%
                                 slice(1),
                         alpha = .5) +
        scale_fill_gradient(low = "red", 
                            high = "green", 
                            na.value = "grey90") +
        labs(title = "Sentiment of the sentences where countries were mentioned (USA excluded)", 
             x = "Longitude", 
             y = "Latitude") +
        theme_minimal()

---
title: "Is this the beginning of the end? - Text mining Donald Trump's UN speech"
output: html_notebook
---

The UN speech that US president Donald Trump gave to the United Nations General Assembly on Sept. 19, 2017 was fascinating for several reasons. For me personally, it was interesting because the speech included complex sentences, and the content was much more complex than all previous talks that I'd heard from him. Several countries were mentioned in the talk, in different emotional contexts. 

So I started to wonder which countries were mentioned in the speech, and in what emotional context.

However, 

![](http://i0.kym-cdn.com/photos/images/newsfeed/000/519/726/2bf.gif)

http://i0.kym-cdn.com/photos/images/newsfeed/000/519/726/2bf.gif

```{r message=FALSE, warning=FALSE}
library(magrittr)
library(tidyverse)
library(rvest)
library(tidytext)
library(forcats)
library(wordcloud)
library(wordcloud2)
library(rworldmap)
library(stringr)
library(ggrepel)
library(hunspell)
library(igraph)
library(ggraph)
```

```{r}
# Source of the transcript (Politico, full text of the speech)
url <- "http://www.politico.com/story/2017/09/19/trump-un-speech-2017-full-text-transcript-242879"

# Scrape the transcript paragraphs and rewrite multi-word country names so
# that each of them survives word tokenisation as a single token.
speech_excerpt <- 
        read_html(url) %>% # Download and parse the whole page
        html_nodes("style~ p+ p , .lazy-load-slot+ p , .fixed-story-third-paragraph+ p , .story-related+ p , p~ p+ p") %>% # Select the required elements (by css selector)
        html_text() %>% # Make it text
        .[-2] %>% # Drop the second element (page text, not part of the speech)
        gsub("Mr.", "Mr", ., fixed = TRUE) %>% # Make sure that dots in the text will not signify sentences
        # Order matters: the longest variants are replaced first, so every way
        # of naming the US collapses into the single token "usa".
        gsub("United States of America", "usa", .) %>% 
        gsub("United States", "usa", .) %>% 
        gsub("America", "usa", .) %>% # Also turns "American(s)" into "usan(s)"
        # NOTE(review): this also rewrites any other capitalised "States"
        # (e.g. "Member States") - confirm that this is intended.
        gsub("States", "usa", .) %>% 
        gsub("North Korea", "north_korea", .) %>% # The underscore keeps North Korea one token for now
        tibble(paragraph = .) # One row per paragraph
                        
# Break every paragraph up into sentences (one row per sentence)
speech_sentences <- unnest_tokens(speech_excerpt, sentence, paragraph, token = "sentences")
        
# One row per word; word forms derived from a country are reduced to the country name
speech_words <- 
        unnest_tokens(speech_excerpt, word, paragraph, token = "words") %>% 
        mutate(
                word = gsub("_", " ", word), # "north_korea" becomes "north korea" again
                word = str_replace_all(word, "'s$", ""), # Cut 's
                # Manual stemming of country names. Standard stemmers (snowballC,
                # hunspell, textstem) and a custom dictionary did not give
                # satisfactory results, so the known forms are mapped by hand.
                word = case_when(
                        word == "iranian" ~ "iran",
                        word %in% c("usans", "north koreans") ~ str_replace(word, "ns$", ""),
                        word %in% c("usan", "syrian", "african", "cuban", "venezuelan") ~ str_replace(word, "n$", ""),
                        TRUE ~ word
                )
        )
```


```{r fig.height=8, fig.width=8, message=FALSE, warning=FALSE}
# Word cloud of the word frequencies, with common stop words removed
wordcloud2(
        count(
                anti_join(speech_words, stop_words, by = "word"),
                word,
                sort = TRUE
        )
)
# Variant that shapes the cloud after an image file:
# wordcloud2(figPath = "trump.png") # Image downloaded from http://westviewnews.org/wp-content/uploads/2017/01/donald-trump-profile-silhouette-vector-graphic_template_1469623874993O0L.png
```


```{r fig.width=10}
# Sentiment-bearing words (bing lexicon) that were uttered more than 3 times.
# The counts of negative words are negated, so they are drawn on the opposite
# side of zero from the positive words.
speech_words %>% 
        count(word, sort = TRUE) %>% 
        inner_join(get_sentiments("bing"), by = "word") %>% 
        filter(n > 3) %>% 
        mutate(n = if_else(sentiment == "negative", -n, n)) %>% 
        ggplot() +
                aes(y = n, x = fct_reorder(word, n), fill = sentiment) +
                geom_col() +
                coord_flip() + # Words on the vertical axis, so they stay readable
                labs(x = "word", 
                     y = "Occurrence in speech",
                     title = "Most common words in Trump's 17/09/19 UN speech by sentiment")
```
# Comparison cloud of all words
```{r fig.height=8, fig.width=8}
# Comparison cloud of the bing sentiment words.
# comparison.cloud() is fed a table with the words as row names and one
# count column per sentiment (0 where a word lacks that sentiment).
inner_join(speech_words, get_sentiments("bing"), by = "word") %>% 
        count(word, sentiment, sort = TRUE) %>%
        spread(key = sentiment, value = n, fill = 0L) %>%
        as.data.frame() %>% 
        remove_rownames() %>% 
        column_to_rownames(var = "word") %>% 
        comparison.cloud(term.matrix = ., colors = c("red", "blue"))
```

# Let's look into specific emotions using the nrc sentiment dictionary

It is also possible to look at the words associated with distinct emotions in the whole speech.

```{r}
# Number of words linked to each distinct emotion of the nrc lexicon
speech_words %>%
        inner_join(get_sentiments("nrc"), by = "word") %>% # Use distinct emotion dictionary
        filter(!sentiment %in% c("positive", "negative")) %>% # Only look for distinct emotions
        count(sentiment, sort = TRUE) %>% # count() groups by sentiment itself
        ggplot() +
                aes(x = fct_reorder(str_to_title(sentiment), -n), # Most frequent emotion first
                    y = n, 
                    label = n) +
                geom_col() +
                geom_label(vjust = 1) +
                theme_minimal() +
                labs(title = "The occurrence of words linked to distinct emotions in the speech", 
                     x = "Emotion", 
                     y = "Frequency")
```

# Let's put the sentiments on a map

First, let's see which countries were mentioned and how many times. Obviously, the (United States of) America is first! Iran, Venezuela, North Korea were also mentioned more than 5 times.

```{r message=FALSE, warning=FALSE}
# World map polygons; region names are lower-cased so they can be matched to words
map_world <- mutate(map_data(map = "world"), region = str_to_lower(region))

# Mentions per word joined to the polygons. The right join keeps every
# polygon row, so countries that were never mentioned get n = NA.
trump_countries <-
        count(speech_words, word) %>% 
        right_join(map_world, by = c("word" = "region")) %>% 
        rename(region = word)

# Mentioned countries, each with the mean of its polygon coordinates
country_names <- 
        trump_countries %>% 
        filter(!is.na(n)) %>%
        group_by(region) %>% 
        summarise(lat = mean(lat), long = mean(long))

```
# Map countries with the number of mentions (USA excluded)

What does this look like on a map? We concentrate on non-US countries, as the USA would outweigh all differences between other countries.

```{r fig.width=10, message=FALSE, warning=FALSE}
# Number of mentions per country on the world map. The count of the USA is
# blanked out (as the accompanying text states), so the USA is drawn in the
# neutral colour and gets no label; its polygon still comes from trump_countries.
mentions_no_usa <- 
        trump_countries %>% 
        mutate(n = replace(n, region == "usa", NA))

mentions_no_usa %>% 
        ggplot() +
        aes(map_id = region, 
            x = long, 
            y = lat, 
            label = paste0(str_to_title(region), ": ", n)) +
        geom_map(aes(fill = log10(n)), # Log scale dampens the differences between countries
                 map = trump_countries) +
        # One label per mentioned country, placed at its first polygon point
        geom_label_repel(data = mentions_no_usa %>% 
                                 drop_na(n) %>% 
                                 group_by(region) %>% 
                                 slice(1), 
                         alpha = .75) +
        scale_fill_gradient(low = "lightblue", 
                            high = "darkblue", 
                            na.value = "grey90") +
        labs(title = "Number of mentions by country", 
             x = "Longitude", 
             y = "Latitude") +
        theme_minimal() +
        theme(legend.position = "none")


```

```{r}
# Sentiment of each sentence: the sum of the bing scores of its words
# (+1 positive, -1 negative); words missing from the lexicon do not count.
sentence_sentiment <-
        speech_sentences %>% 
        mutate(sentence_num = row_number()) %>% # Running number of the sentence
        unnest_tokens(word, sentence, "words") %>% 
        mutate(word = gsub("_", " ", word)) %>% 
        # Manual stemming of country names, the same mapping as used for
        # speech_words (standard stemmers gave no satisfactory results).
        mutate(word = word %>% 
                       str_replace_all("'s$","") %>% # Cut 's
                       if_else(. == "iranian", "iran", .) %>% 
                       if_else(. %in% c("usans", "north koreans"), str_replace(., "ns$",""),.) %>% 
                       if_else(. %in% c("usan","syrian","african","cuban","venezuelan"), str_replace(., "n$",""),.)
        ) %>% 
        left_join(get_sentiments("bing"), by = "word") %>%
        mutate(sentiment_score = case_when(sentiment == "positive" ~ 1,
                                           sentiment == "negative" ~ -1,
                                           is.na(sentiment) ~ NA_real_)) %>%
        group_by(sentence_num) %>%
        summarise(sum_sentiment = sum(sentiment_score, na.rm = TRUE),
                  sentence = paste(word, collapse = " ")) # Sentence rebuilt from its normalised words

# Which sentence has a country name.
# The words get the same normalisation as in speech_words, from which the
# list of mentioned countries is derived. Without it, forms such as "syrian"
# or a possessive would not be matched to their country.
country_sentence <- 
        speech_sentences %>% 
        mutate(sentence_num = row_number()) %>% 
        unnest_tokens(word, sentence, "words") %>% 
        mutate(word = gsub("_", " ", word)) %>% 
        mutate(word = word %>% 
                       str_replace_all("'s$","") %>% # Cut 's
                       if_else(. == "iranian", "iran", .) %>% 
                       if_else(. %in% c("usans", "north koreans"), str_replace(., "ns$",""),.) %>% 
                       if_else(. %in% c("usan","syrian","african","cuban","venezuelan"), str_replace(., "n$",""),.)
        ) %>% 
        # Keep only the words that are mentioned countries
        right_join(country_names %>% select(region), by = c("word" = "region")) %>% 
        arrange(sentence_num)

# Sentiment per country: the summed sentiment of the sentences that mention it
country_sentiment <-         
        full_join(sentence_sentiment, country_sentence, by = "sentence_num") %>% 
        transmute(region = word, sum_sentiment) %>% 
        filter(!is.na(region), !is.na(sum_sentiment)) %>% # Only sentences that have a country
        group_by(region) %>% 
        summarise(country_sentiment = sum(sum_sentiment, na.rm = TRUE))

```

So, how about looking into the text and checking in which sentences the countries are mentioned?

```{r fig.width=10, message=FALSE, warning=FALSE}
# Sentence sentiment in the order of the speech, with the country names
# shown at the sentences in which they occur
sentence_sentiment %>% 
        full_join(country_sentence, by = "sentence_num") %>% 
        mutate(sentiment_type = case_when(sum_sentiment > 0 ~ "positive",
                                          sum_sentiment < 0 ~ "negative",
                                          sum_sentiment == 0 ~ "neutral") %>% 
                                fct_rev() # Reverse the alphabetical level order (positive first)
               ) %>% 
        ggplot() +
                aes(x = sentence_num, 
                    y = sum_sentiment, 
                    label = word %>% str_to_title()) +
                geom_hline(yintercept = 0, 
                           color = "grey", 
                           linetype = "dashed", 
                           size = 1.2) +
                geom_smooth(span = 0.05, 
                            se = TRUE, 
                            size = 1.2, 
                            color = "black") +
                geom_label_repel(aes(fill = sentiment_type), 
                                 alpha = .8, 
                                 segment.alpha = 0) +
                # Named values keep each colour tied to its sentiment even when
                # one of the three levels does not occur in the data
                scale_fill_manual(values = c(positive = "green", 
                                             neutral = "grey", 
                                             negative = "red")) +
                theme_minimal() +
                labs(x = "Sentence number", 
                     y = "Sentence sentiment", 
                     title = "The summarised sentiment of sentences, and the appearance of country names in the speech \nby sentiment in sentence order",
                     subtitle = "The dashed line signifies neutral sentence sentiment. \nCountry label colors show the direction of the sentiment (positive/negative)") 

```


```{r fig.width=10, message=FALSE, warning=FALSE}
# Attach the per-country sentiment to the map polygons
sentiment_map_data <- left_join(trump_countries, country_sentiment, by = "region")

# World map coloured by country sentiment; the fill of the USA is set to NA
ggplot(data = mutate(sentiment_map_data,
                     country_sentiment = if_else(region == "usa", NA_real_, country_sentiment))) +
        aes(map_id = region, 
            x = long, 
            y = lat, 
            label = paste0(str_to_title(region), ": ", country_sentiment)) +
        geom_map(aes(fill = country_sentiment), map = trump_countries) +
        # One label per mentioned country, taken from its first polygon point
        geom_label_repel(data = sentiment_map_data %>%
                                 filter(!is.na(n)) %>%
                                 group_by(region) %>%
                                 slice(1),
                         alpha = .5) +
        scale_fill_gradient(low = "red", 
                            high = "green", 
                            na.value = "grey90") +
        labs(title = "Sentiment of the sentences where countries were mentioned (USA excluded)", 
             x = "Longitude", 
             y = "Latitude") +
        theme_minimal()

```

